{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "88c355af",
      "metadata": {
        "id": "2eec5cc39a59"
      },
      "outputs": [],
      "source": [
        "# Copyright 2024 Google LLC\n",
        "#\n",
        "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
        "# you may not use this file except in compliance with the License.\n",
        "# You may obtain a copy of the License at\n",
        "#\n",
        "#     https://www.apache.org/licenses/LICENSE-2.0\n",
        "#\n",
        "# Unless required by applicable law or agreed to in writing, software\n",
        "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
        "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
        "# See the License for the specific language governing permissions and\n",
        "# limitations under the License."
      ]
    },
    {
      "cell_type": "markdown",
      "id": "4a87f86f",
      "metadata": {
        "id": "31ab8a519de3"
      },
      "source": [
        "| | |\n",
        "|-|-|\n",
        "|Author(s) | [Valentin Huerta](https://github.com/valentinhuerta1996) |\n",
        "||  [Ulises Jimenez](https://github.com/ulises-jimenez07) |"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 1,
      "id": "6351524b-956c-4745-b651-3052f0181357",
      "metadata": {
        "id": "fec5d5b34a0a"
      },
      "outputs": [],
      "source": [
        "from entity_processor import DocumentAIEntityExtractor, ModelBasedEntityExtractor\n",
        "from extractor import OnlineDocumentExtractor\n",
        "from prompts_module import get_compare_entities_prompt, get_extract_entities_prompt\n",
        "from temp_file_uploader import TempFileUploader\n",
        "from vertexai.generative_models import GenerativeModel"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 2,
      "id": "a392e37a-e731-44ce-b81c-0db71395636a",
      "metadata": {
        "id": "06e3d60cf1f2"
      },
      "outputs": [],
      "source": [
        "project_id = \"project-id\"\n",
        "location = \"us\"  # Or other supported locations like 'eu'\n",
        "processor_id = \"processor-id\"\n",
        "processor_version_id = \"processor-version-id\"  # Optional for batch processing\n",
        "# File to process\n",
        "file_path = \"test_file.pdf\"\n",
        "mime_type = \"application/pdf\"\n",
        "\n",
        "gcs_output_uri = \"gs://bucket-output\"  # GCS URI for output\n",
        "gcs_temp_uri = \"gs://bucket-temp\"  # GCS URI for output"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 3,
      "id": "3a2815a0-1f57-4ca1-9818-19fa3a384300",
      "metadata": {
        "id": "b5253c7b3285"
      },
      "outputs": [],
      "source": [
        "# Build the online extractor for the configured processor.\n",
        "# To target a specific processor version, also pass\n",
        "# processor_version_id=processor_version_id.\n",
        "online_extractor = OnlineDocumentExtractor(\n",
        "    project_id=project_id, location=location, processor_id=processor_id\n",
        ")\n",
        "\n",
        "# Send the file through the processor.\n",
        "online_document = online_extractor.process_document(file_path, mime_type)\n",
        "\n",
        "# Extract the entities from the processed document; later cells read\n",
        "# `docai_entities`.\n",
        "docai_entity_extractor = DocumentAIEntityExtractor(online_document)\n",
        "docai_entities = docai_entity_extractor.extract_entities()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 7,
      "id": "fd4b0dd2-325d-4a13-b741-b1063098c2f3",
      "metadata": {
        "id": "9788574b815b"
      },
      "outputs": [],
      "source": [
        "docai_entities"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 5,
      "id": "5768a5dc-0ebf-4891-992f-6fbdea315b97",
      "metadata": {
        "id": "06c441a872d9"
      },
      "outputs": [],
      "source": [
        "# Upload the local file through TempFileUploader; the value it returns is\n",
        "# handed to the Gemini-based extractor as its input location.\n",
        "temp_file_uploader = TempFileUploader(gcs_temp_uri)\n",
        "gcs_input_uri = temp_file_uploader.upload_file(file_path)\n",
        "\n",
        "# try/finally guarantees the uploaded temporary file is deleted even when\n",
        "# building the prompt or extracting entities raises, so failed runs do not\n",
        "# leave stray objects behind.\n",
        "try:\n",
        "    prompt_extract = get_extract_entities_prompt()\n",
        "    model_extractor = ModelBasedEntityExtractor(\n",
        "        \"gemini-2.0-flash\", prompt_extract, gcs_input_uri\n",
        "    )\n",
        "    # Later cells read `gemini_entities`.\n",
        "    gemini_entities = model_extractor.extract_entities()\n",
        "finally:\n",
        "    temp_file_uploader.delete_file()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 8,
      "id": "02e0a0d3-11da-4673-82dc-bed9efac27f8",
      "metadata": {
        "id": "aebb108dc010"
      },
      "outputs": [],
      "source": [
        "# Fill the comparison prompt template with both extraction results.\n",
        "compare_prompt = get_compare_entities_prompt().format(\n",
        "    docai_output=str(docai_entities),\n",
        "    gemini_output=str(gemini_entities),\n",
        ")\n",
        "\n",
        "# Send the filled prompt to the model and print the text of its response.\n",
        "model = GenerativeModel(\"gemini-2.0-flash\")\n",
        "docai_gemini_response_analysis = model.generate_content(compare_prompt)\n",
        "print(docai_gemini_response_analysis.text)"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "name": "test_document_ai_gemini.ipynb",
      "toc_visible": true
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
